1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1997, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <errno.h> 30 #include <sys/types.h> 31 32 #include "tab_lookup.h" /* table lookup data types */ 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 37 enum _USTATE { U0, U1, U11, U2, U3, U4 }; 38 39 40 int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx, 41 unsigned long *ibm_code); 42 43 int bisearch(unsigned long val, _icv_state *st, int n); 44 45 int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, 46 size_t buflen, _icv_state *st); 47 48 /* 49 * Actual conversion; called from iconv() 50 * Input is UTF-8 data. 51 * first convert to UCS2 52 */ 53 size_t 54 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft, 55 char **outbuf, size_t *outbytesleft) 56 { 57 /* 58 * Actual conversion; called from iconv() 59 */ 60 /*========================================================= 61 * 62 * State Machine for interpreting UTF8 code 63 * 64 *========================================================= 65 * 66 * 3 byte unicode 67 * +----->------->-------+ 68 * | | 69 * ^ v 70 * | 2 byte U2 ---> U3 71 * | unicode v 72 * +------> U0 -------> U1 +-------->U4---+ 73 * ^ ascii | | ^ | 74 * | | +-------->--------->--------+ | 75 * | v v 76 * +----<---+-----<------------<------------<------------+ 77 * 78 * +----<---+-----<------------<------------<------------+ 79 * 80 *=========================================================*/ 81 82 char c1 = '\0', c2 = '\0'; 83 int n, unidx; 84 unsigned long ibm_code; 85 86 #ifdef DEBUG 87 fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n"); 88 #endif 89 90 if (st == NULL) { 91 errno = EBADF; 92 return ((size_t) -1); 93 } 94 95 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 96 st->ustate = U0; 97 st->_errno = 0; 98 st->shift = SHIFT_IN; 99 return ((size_t) 0); 100 } 101 102 st->_errno = 0; /* reset internal errno */ 103 errno = 0; /* reset external errno */ 104 105 /* a state machine for interpreting UTF8 code */ 106 while (*inbytesleft > 0 && *outbytesleft > 0) { 107 switch (st->ustate) { 108 case U0: 109 /* it is ascii, convert it immediately */ 110 if ((**inbuf & MSB) == 0) { /* ASCII */ 111 st->ustate = U4; 112 st->keepc[0] = **inbuf; 113 c1 = 0x0; 114 c2 = **inbuf; 115 continue; 116 } else { /* Chinese character */ 117 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */ 118 st->ustate = U1; 119 st->keepc[0] = **inbuf; 120 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */ 121 st->ustate = U2; 122 st->keepc[0] = **inbuf; 123 } else { /* illegal unicode */ 124 /* st->_errno = errno = EINVAL; */ 125 /* possible UNICODE ko_KR-UTF8 */ 126 c1 =st->keepc[0] = **inbuf; 127 st->ustate = U11; 128 break; 129 } 130 } 131 break; 132 case U1: /* 2 byte unicode */ 133 if ((**inbuf & 0xc0) == MSB) { 134 st->ustate = U4; 135 st->keepc[1] = **inbuf; 136 c1 = (st->keepc[0]&0x1c)>>2; 137 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 138 #ifdef DEBUG 139 fprintf(stderr, "UTF8: %02x%02x --> ", 140 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 141 #endif 142 continue; /* should not advance *inbuf */ 143 } else { 144 st->_errno = errno = EINVAL; 145 } 146 break; 147 case U11: /* 3 byte unicode - 2nd byte */ 148 c2 =st->keepc[1] = **inbuf; 149 st->ustate = U4; 150 continue; 151 break; 152 case U2: /* 3 byte unicode - 2nd byte */ 153 if ((**inbuf & 0xc0) == MSB) { 154 st->ustate = U3; 155 st->keepc[1] = **inbuf; 156 } else { 157 st->_errno = errno = EINVAL; 158 } 159 break; 160 case U3: /* 3 byte unicode - 3rd byte */ 161 if ((**inbuf & 0xc0) == MSB) { 162 st->ustate = U4; 163 st->keepc[2] = **inbuf; 164 c1 = ((st->keepc[0]&0x0f)<<4) | 165 ((st->keepc[1]&0x3c)>>2); 166 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 167 #ifdef DEBUG 168 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 169 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 170 #endif 171 continue; /* should not advance *inbuf */ 172 } else { 173 st->_errno = errno = EINVAL; 174 } 175 break; 176 case U4: 177 n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code); 178 if (n != 0) { /* legal unicode;illegal Big5 */ 179 st->_errno = errno = EILSEQ; 180 break; 181 } 182 183 n = utf8_to_ibm(unidx, ibm_code, 184 *outbuf, *outbytesleft, st); 185 if (n > 0) { 186 (*outbuf) += n; 187 (*outbytesleft) -= n; 188 } else { 189 st->_errno = errno; 190 return((size_t)-1); 191 } 192 st->ustate = U0; 193 st->_errno = 0; 194 break; 195 default: /* should never come here */ 196 st->_errno = errno = EILSEQ; 197 st->ustate = U0; /* reset state */ 198 break; 199 } 200 201 (*inbuf)++; 202 (*inbytesleft)--; 203 204 if (st->_errno) { 205 #ifdef DEBUG 206 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 207 st->_errno, st->ustate); 208 #endif 209 break; 210 } 211 212 if (errno) 213 return((size_t)-1); 214 } 215 216 if (*outbytesleft == 0) { 217 errno = E2BIG; 218 return((size_t)-1); 219 } 220 return (*inbytesleft); 221 } 222 223 224 /* 225 * Match IBM code by UTF8 code; 226 * Return: = 0 - match from Unicode to IBM found 227 * = 1 - match from Unicode to IBM NOT found 228 * 229 * Since binary search of the UTF8 to IBM table is necessary, might as well 230 * return index and IBM code matching to the unicode. 231 */ 232 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code) 233 _icv_state *st; 234 char c1, c2; 235 int *unidx; 236 unsigned long *ibm_code; 237 { 238 unsigned long unicode; 239 240 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 241 *unidx = bisearch(unicode, st, st->table_size); 242 if ((*unidx) >= 0) 243 { 244 if ( st->left_to_right ) 245 *ibm_code = st->table[*unidx].right_code; 246 else 247 *ibm_code = st->table[*unidx].left_code; 248 } 249 #ifdef DEBUG 250 fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code); 251 #endif 252 253 return(0); 254 } 255 256 257 /* 258 * ISO/IEC 10646 (Unicode) --> IBM 259 * Unicode --> UTF8 (FSS-UTF) 260 * (File System Safe Universal Character Set Transformation Format) 261 * Return: > 0 - converted with enough space in output buffer 262 * = 0 - no space in outbuf 263 */ 264 int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, size_t buflen, 265 _icv_state *st) 266 { 267 unsigned long val; /* IBM value */ 268 char c1, c2, ibm_str[3]; 269 270 if (unidx < 0) /* no match from UTF8 to IBM */ 271 ibm_code = (unsigned long)NON_ID_CHAR; 272 273 { 274 val = ibm_code & 0xffff; 275 c1 = (char) ((val & 0xff00) >> 8); 276 c2 = (char) (val & 0xff); 277 } 278 279 /* it is single byte ascii */ 280 if ( c1 == 0x0 ) { 281 if ( st->shift == SHIFT_OUT ) { 282 if (buflen < 2) { 283 errno = E2BIG; 284 return 0; 285 } 286 *buf = SHIFT_IN; 287 *(buf+1) = c2; 288 st->shift = SHIFT_IN; 289 return 2; 290 } 291 if (buflen < 1) { 292 errno = E2BIG; 293 return 0; 294 } 295 *buf = c2; 296 return 1; 297 } 298 299 /* it is the first two bytes character */ 300 if ( st->shift == SHIFT_IN ) { 301 if (buflen < 3) { 302 errno = E2BIG; 303 return 0; 304 } 305 *buf = SHIFT_OUT; 306 st->shift = SHIFT_OUT; 307 *(buf+1) = c1; 308 *(buf+2) = c2; 309 return 3; 310 } 311 312 *buf = ibm_str[0] = c1; 313 *(buf+1) = ibm_str[1] = c2; 314 ibm_str[2] = '\0'; 315 316 #ifdef DEBUG 317 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 318 #endif 319 320 321 if (buflen < 2) { 322 errno = E2BIG; 323 return(0); 324 } 325 326 return(2); 327 } 328